1 Data load and preprocessing

# Install the data using seurat-disk below if data("panc8") does not exist,
# InstallData("panc8")

data("panc8")
pancreas.list <- SplitObject(panc8, split.by = "tech")
pancreas.list <-
  pancreas.list[c("celseq", "celseq2", "fluidigmc1", "smartseq2")]

# normally, we use Read10X or read.table or read.csv
#dat.1 <- Read10X(data.dir = "")
#dat.2 <- Read10X(data.dir = "")

#dat.1.obj <- CreateSeuratObject(counts = dat.1, project = "stim", min.cells = 3, min.features = 200)
#dat.2.obj <- CreateSeuratObject(counts = dat.2, project = "ctrl", min.cells = 3, min.features = 200)

#Give a group name ("treatment") and sample labels (stil and ctrl) to both data
#dat.1.obj$treatment="stim"
#dat.2.obj$treatment="ctrl"
for (i in 1:length(pancreas.list)) {
  pancreas.list[[i]] <-
    NormalizeData(pancreas.list[[i]], verbose = FALSE)
  pancreas.list[[i]] <-
    FindVariableFeatures(
      pancreas.list[[i]],
      selection.method = "vst",
      nfeatures = 2000,
      verbose = FALSE
    )
}

reference.list <- pancreas.list[c("celseq", "celseq2", "smartseq2")]

2 Multiple data integration

# Find integration anchors and integrate data
pancreas.anchors <-
  FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
pancreas.integrated <-
  IntegrateData(anchorset = pancreas.anchors, dims = 1:30)

# switch to integrated assay.
DefaultAssay(pancreas.integrated) <- "integrated"

# Run the standard workflow for visualization and clustering
pancreas.integrated <-
  ScaleData(pancreas.integrated, verbose = FALSE)
pancreas.integrated <-
  RunPCA(pancreas.integrated, npcs = 30, verbose = FALSE)
pancreas.integrated <-
  RunUMAP(
    pancreas.integrated,
    reduction = "pca",
    dims = 1:30,
    verbose = FALSE
  )

# Number of cells in each condition
table(pancreas.integrated$tech)
## 
##    celseq   celseq2 smartseq2 
##      1004      2285      2394
table(pancreas.integrated@meta.data$tech,
      pancreas.integrated@meta.data$celltype)
##            
##             acinar activated_stellate alpha beta delta ductal endothelial
##   celseq       229                 19   213  161    50    304           5
##   celseq2      274                 90   844  445   203    257          21
##   smartseq2    188                 55  1008  308   127    444          21
##            
##             epsilon gamma macrophage mast quiescent_stellate schwann
##   celseq          1    18          1    1                  1       1
##   celseq2         4   110         15    6                 12       4
##   smartseq2       8   213          7    7                  6       2
# Visualization
p1 <-
  DimPlot(pancreas.integrated,
          reduction = "umap",
          group.by = "tech")

p2 <-
  DimPlot(
    pancreas.integrated,
    reduction = "umap",
    group.by = "celltype",
    label = TRUE,
    repel = TRUE
  ) + NoLegend()

p1 + p2

p3 <-
  DimPlot(pancreas.integrated,
          reduction = "umap",
          group.by = "celltype")

LabelClusters(
  p3,
  id = "celltype",
  color = unique(ggplot_build(p3)$data[[1]]$colour),
  size = 5,
  repel = T,
  box.padding = 1.25
)

DimPlot(
  pancreas.integrated,
  reduction = "umap",
  split.by = "tech",
  label = TRUE
)

3 Find marker genes

# Remember to switch to raw data for DEG
DefaultAssay(pancreas.integrated) <- "RNA"

# Find DEGs in celseq
celseq.markers <-
  FindMarkers(
    pancreas.integrated,
    ident.1 = "celseq",
    group.by = "tech",
    logfc.threshold = 0.25,
    only.pos = TRUE
  )

# Find DEGs in alpha
Idents(pancreas.integrated) <- "celltype"
alpha.markers <-
  FindMarkers(
    pancreas.integrated,
    ident.1 = "acinar",
    logfc.threshold = 0.25,
    only.pos = TRUE
  )

# Find conserved DEGs among techs
conserve.markers <-
  FindConservedMarkers(pancreas.integrated,
                       ident.1 = c("acinar"),
                       grouping.var = "tech")

# Find DEGs for all techs
Idents(pancreas.integrated) <- "tech"
all.markers <-
  FindAllMarkers(pancreas.integrated,
                 logfc.threshold = 0.25,
                 only.pos = TRUE)
top_10_marker <-
  all.markers %>% group_by(cluster) %>% top_n(n = 10, avg_log2FC)
head(top_10_marker)
## # A tibble: 6 x 7
## # Groups:   cluster [1]
##   p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene      
##   <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>     
## 1     0       5.60 0.581 0.006         0 celseq  INS-IGF2  
## 2     0       3.70 1     0.488         0 celseq  ERCC-00002
## 3     0       3.39 1     0.488         0 celseq  ERCC-00096
## 4     0       3.10 0.994 0.329         0 celseq  ERCC-00136
## 5     0       3.07 1     0.488         0 celseq  ERCC-00046
## 6     0       3.05 0.995 0.848         0 celseq  RPS18
# Draw heatmap
DoHeatmap(
  pancreas.integrated,
  features = top_10_marker$gene,
  slot = "counts",
  size = 4
) +
  scale_fill_gradientn(colors = RColorBrewer::brewer.pal(n = 9, name = "RdBu"))

4 Marker gene visualizations

VlnPlot(pancreas.integrated, features = c("REG1A"))

VlnPlot(pancreas.integrated,
        features = c("REG1A"),
        group.by = "celltype")

VlnPlot(
  pancreas.integrated,
  features = c("REG1A"),
  group.by = "celltype",
  split.by = "tech"
)

pancreas.integrated.sub <-
  subset(pancreas.integrated, idents = c("celseq", "celseq2"))

VlnPlot(
  pancreas.integrated.sub,
  features = c("REG1A"),
  group.by = "celltype",
  split.by = "tech",
  cols = c("red", "grey", "blue"),
  pt.size = 0
)

FeaturePlot(
  pancreas.integrated,
  features = c("REG1A"),
  split.by = "tech",
  max.cutoff = 3,
  cols = c("grey", "red")
)

5 Reference mapping and cell type classification

Seurat also supports the projection of reference data (or meta data) onto a query object. While many of the methods are conserved (both procedures begin by identifying anchors), there are two important distinctions between data transfer and integration:

In data transfer, Seurat does not correct or modify the query expression data. In data transfer, Seurat has an option (set by default) to project the PCA structure of a reference onto the query, instead of learning a joint structure with CCA. We generally suggest using this option when projecting data between scRNA-seq datasets.

# Use the integrated assay
DefaultAssay(pancreas.integrated) <- "integrated"


# setup the reference (query) object list (optional)
pancreas.query <- pancreas.list[["fluidigmc1"]]

# Find anchors for transfer
pancreas.anchors <-
  FindTransferAnchors(
    reference = pancreas.integrated,
    query = pancreas.query,
    dims = 1:30,
    reference.reduction = "pca"
  )

predictions <-
  TransferData(anchorset = pancreas.anchors,
               refdata = pancreas.integrated$celltype,
               dims = 1:30)

pancreas.query <-
  AddMetaData(pancreas.query, metadata = predictions)

pancreas.query$prediction.match <-
  pancreas.query$predicted.id == pancreas.query$celltype

table(pancreas.query$prediction.match)
## 
## FALSE  TRUE 
##    21   617
pancreas.integrated <-
  RunUMAP(
    pancreas.integrated,
    dims = 1:30,
    reduction = "pca",
    return.model = TRUE
  )
pancreas.query <-
  MapQuery(
    anchorset = pancreas.anchors,
    reference = pancreas.integrated,
    query = pancreas.query,
    refdata = list(celltype = "celltype"),
    reference.reduction = "pca",
    reduction.model = "umap"
  )

p1 <-
  DimPlot(
    pancreas.integrated,
    reduction = "umap",
    group.by = "celltype",
    label = TRUE,
    label.size = 3,
    repel = TRUE
  ) + NoLegend() + ggtitle("Reference annotations")
p2 <-
  DimPlot(
    pancreas.query,
    reduction = "ref.umap",
    group.by = "predicted.celltype",
    label = TRUE,
    label.size = 3,
    repel = TRUE
  ) + NoLegend() + ggtitle("Query transferred labels")

p1 + p2

VlnPlot(pancreas.query,
        features = c("REG1A"),
        group.by = "celltype")

pancreas.merge <- merge(pancreas.integrated, pancreas.query)

VlnPlot(
  pancreas.merge,
  features = c("REG1A"),
  group.by = "celltype",
  split.by = "tech",
  cols = c("red", "grey", "blue", "green"),
  pt.size = 0
)

6 Darw a sankey plot (not covered by Seurat)

sankey.dat <-
  data.frame(
    source = pancreas.query$predicted.id,
    target = pancreas.query$celltype,
    value = rep(1, length(pancreas.query$celltype))
  )

sankey.dat$new <- paste(sankey.dat$source, sankey.dat$target)

# create a connecting data frame of label pair frequencies
sankey.link  <- aggregate(value ~ new, sankey.dat, sum)
sankey.link <-
  separate(
    sankey.link ,
    col = new,
    into = c("source", "target"),
    sep = " "
  )
sankey.link$target <- paste(sankey.link$target, " ", sep = "")

# create a node data frame of all unique labels
sankey.nodes <- data.frame(name = c(
  as.character(sankey.link$source),
  as.character(sankey.link$target)
) %>% unique())

# transfer target and source names to node numbers
sankey.link$IDsource <-
  match(sankey.link$source, sankey.nodes$name) - 1
sankey.link$IDtarget <-
  match(sankey.link$target, sankey.nodes$name) - 1

p <- sankeyNetwork(
  Links = sankey.link,
  Nodes = sankey.nodes,
  Source = "IDsource",
  Target = "IDtarget",
  Value = "value",
  NodeID = "name",
  sinksRight = FALSE,
  fontSize = 15,
  nodeWidth = 40,
  nodePadding = 10
)
p
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server 7.9 (Maipo)
## 
## Matrix products: default
## BLAS/LAPACK: /opt/intel/19.0.5/compilers_and_libraries_2019.5.281/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] networkD3_0.4               RCurl_1.98-1.4             
##  [3] cowplot_1.1.1               scales_1.1.1               
##  [5] Matrix_1.3-4                forcats_0.5.0              
##  [7] stringr_1.4.0               dplyr_1.0.1                
##  [9] purrr_0.3.4                 readr_1.3.1                
## [11] tidyr_1.1.1                 tibble_3.0.3               
## [13] ggplot2_3.3.5               tidyverse_1.3.0            
## [15] SeuratObject_4.0.2          Seurat_4.0.4               
## [17] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [19] Biobase_2.50.0              GenomicRanges_1.42.0       
## [21] GenomeInfoDb_1.26.7         IRanges_2.24.1             
## [23] S4Vectors_0.28.1            BiocGenerics_0.36.1        
## [25] MatrixGenerics_1.2.1        matrixStats_0.60.1         
## [27] panc8.SeuratData_3.0.2      SeuratData_0.2.1           
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        sn_2.0.0              
##   [4] plyr_1.8.6             igraph_1.2.6           lazyeval_0.2.2        
##   [7] splines_4.0.2          listenv_0.8.0          scattermore_0.7       
##  [10] TH.data_1.1-0          digest_0.6.25          htmltools_0.5.2       
##  [13] fansi_0.4.1            magrittr_1.5           tensor_1.5            
##  [16] cluster_2.1.0          ROCR_1.0-11            limma_3.46.0          
##  [19] globals_0.14.0         modelr_0.1.8           sandwich_3.0-1        
##  [22] spatstat.sparse_2.0-0  colorspace_1.4-1       rvest_0.3.6           
##  [25] blob_1.2.1             rappdirs_0.3.1         ggrepel_0.9.1         
##  [28] rbibutils_2.2.4        haven_2.3.1            xfun_0.16             
##  [31] crayon_1.3.4           jsonlite_1.7.0         spatstat.data_2.1-0   
##  [34] survival_3.1-12        zoo_1.8-9              glue_1.4.1            
##  [37] polyclip_1.10-0        gtable_0.3.0           zlibbioc_1.36.0       
##  [40] XVector_0.30.0         leiden_0.3.9           DelayedArray_0.16.3   
##  [43] future.apply_1.8.1     abind_1.4-5            mvtnorm_1.1-1         
##  [46] DBI_1.1.0              miniUI_0.1.1.1         Rcpp_1.0.7            
##  [49] plotrix_3.8-1          metap_1.5              viridisLite_0.4.0     
##  [52] xtable_1.8-4           tmvnsim_1.0-2          reticulate_1.20       
##  [55] spatstat.core_2.3-0    htmlwidgets_1.5.3      httr_1.4.2            
##  [58] RColorBrewer_1.1-2     TFisher_0.2.0          ellipsis_0.3.1        
##  [61] ica_1.0-2              farver_2.0.3           pkgconfig_2.0.3       
##  [64] uwot_0.1.10            dbplyr_1.4.4           deldir_0.1-28         
##  [67] utf8_1.1.4             labeling_0.3           tidyselect_1.1.0      
##  [70] rlang_0.4.11           reshape2_1.4.4         later_1.1.0.1         
##  [73] cellranger_1.1.0       munsell_0.5.0          tools_4.0.2           
##  [76] cli_2.0.2              generics_0.0.2         mathjaxr_1.4-0        
##  [79] broom_0.7.9            ggridges_0.5.3         evaluate_0.14         
##  [82] fastmap_1.1.0          yaml_2.2.1             goftest_1.2-2         
##  [85] fs_1.5.0               knitr_1.29             fitdistrplus_1.1-5    
##  [88] RANN_2.6.1             pbapply_1.4-3          future_1.22.1         
##  [91] nlme_3.1-148           mime_0.9               xml2_1.3.2            
##  [94] rstudioapi_0.11        compiler_4.0.2         plotly_4.9.4.1        
##  [97] png_0.1-7              spatstat.utils_2.2-0   reprex_0.3.0          
## [100] stringi_1.4.6          lattice_0.20-41        multtest_2.46.0       
## [103] vctrs_0.3.2            mutoss_0.1-12          pillar_1.4.6          
## [106] lifecycle_0.2.0        Rdpack_2.1.2           spatstat.geom_2.2-2   
## [109] lmtest_0.9-38          RcppAnnoy_0.0.19       data.table_1.14.0     
## [112] bitops_1.0-7           irlba_2.3.3            httpuv_1.5.4          
## [115] patchwork_1.1.1        R6_2.4.1               promises_1.1.1        
## [118] KernSmooth_2.23-17     gridExtra_2.3          parallelly_1.27.0     
## [121] codetools_0.2-16       MASS_7.3-51.6          assertthat_0.2.1      
## [124] withr_2.2.0            mnormt_2.0.2           sctransform_0.3.2     
## [127] multcomp_1.4-17        GenomeInfoDbData_1.2.4 mgcv_1.8-31           
## [130] hms_0.5.3              grid_4.0.2             rpart_4.1-15          
## [133] rmarkdown_2.3          Rtsne_0.15             numDeriv_2016.8-1.1   
## [136] shiny_1.5.0            lubridate_1.7.9